beerData<-read_csv("./Data/Beers.csv")
## Parsed with column specification:
## cols(
## Name = col_character(),
## Beer_ID = col_integer(),
## ABV = col_double(),
## IBU = col_integer(),
## Brewery_id = col_integer(),
## Style = col_character(),
## Ounces = col_double()
## )
breweryData<-read_csv("./Data/Breweries.csv")
## Parsed with column specification:
## cols(
## Brew_ID = col_integer(),
## Name = col_character(),
## City = col_character(),
## State = col_character()
## )
###b. What are the dimensions of the data?
glimpse(beerData)
## Observations: 2,410
## Variables: 7
## $ Name <chr> "Pub Beer", "Devil's Cup", "Rise of the Phoenix", "Si…
## $ Beer_ID <int> 1436, 2265, 2264, 2263, 2262, 2261, 2260, 2259, 2258,…
## $ ABV <dbl> 0.050, 0.066, 0.071, 0.090, 0.075, 0.077, 0.045, 0.06…
## $ IBU <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Brewery_id <int> 409, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178…
## $ Style <chr> "American Pale Lager", "American Pale Ale (APA)", "Am…
## $ Ounces <dbl> 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0,…
summary(beerData)
## Name Beer_ID ABV IBU
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00
## Mode :character Median :1453.5 Median :0.05600 Median : 35.00
## Mean :1431.1 Mean :0.05977 Mean : 42.71
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00
## Max. :2692.0 Max. :0.12800 Max. :138.00
## NA's :62 NA's :1005
## Brewery_id Style Ounces
## Min. : 1.0 Length:2410 Min. : 8.40
## 1st Qu.: 94.0 Class :character 1st Qu.:12.00
## Median :206.0 Mode :character Median :12.00
## Mean :232.7 Mean :13.59
## 3rd Qu.:367.0 3rd Qu.:16.00
## Max. :558.0 Max. :32.00
##
#Name Beer_ID ABV IBU Brewery_id Style Ounces
#beer summarized by style
beerData %>%
group_by(Style) %>%
summarise(
count= n(),
IBU_count = sum(!is.na(IBU)),
IBU_average = mean(IBU, na.rm = TRUE),
IBU_sd = sd(IBU, na.rm = TRUE),
ABV_count = sum(!is.na(ABV)),
ABV_average = mean(ABV, na.rm = TRUE),
ABV_sd = sd(ABV, na.rm = TRUE),
Ounces_count = sum(!is.na(Ounces)),
Ounces_average = mean(Ounces, na.rm = TRUE),
Ounces_sd = sd(Ounces, na.rm = TRUE)
)
## # A tibble: 100 x 11
## Style count IBU_count IBU_average IBU_sd ABV_count ABV_average ABV_sd
## <chr> <int> <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 <NA> 5 2 24 4.24 2 0.0565 0.00495
## 2 Abbe… 2 2 22 0 2 0.049 0
## 3 Altb… 13 8 34.1 8.34 13 0.0544 0.00690
## 4 Amer… 18 11 11 3.77 18 0.0487 0.00431
## 5 Amer… 133 77 36.3 19.6 125 0.0575 0.00868
## 6 Amer… 29 16 23.2 9.39 28 0.0495 0.00464
## 7 Amer… 3 2 96 5.66 3 0.099 0
## 8 Amer… 36 20 68.9 21.4 35 0.0690 0.0126
## 9 Amer… 108 61 21.0 7.28 104 0.0496 0.00494
## 10 Amer… 70 38 29.9 12.3 68 0.0579 0.00971
## # … with 90 more rows, and 3 more variables: Ounces_count <int>,
## # Ounces_average <dbl>, Ounces_sd <dbl>
#Percentage of missing values in dataset
paste0("The total number of beers is: ", nrow(beerData))
## [1] "The total number of beers is: 2410"
percent_missing<-beerData %>%
summarise_all(funs(100*sum(is.na(.))/nrow(beerData))) %>% #count the number of NAs or blanks divided by total rows
round(2) %>%
lapply(function(x){paste0(x,"%")}) %>%
as_tibble()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.))
##
## # After:
## list(name = ~ f(.))
## This warning is displayed once per session.
missing_info <- paste(' ABV >', percent_missing$ABV, ' IBU >', percent_missing$IBU, ' Style >', percent_missing$Style)
# ABV, IBU, Style have missing values
#hmm what should be done about the missing values...
glimpse(breweryData)
## Observations: 558
## Variables: 4
## $ Brew_ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ Name <chr> "NorthGate Brewing", "Against the Grain Brewery", "Jack'…
## $ City <chr> "Minneapolis", "Louisville", "Framingham", "San Diego", …
## $ State <chr> "MN", "KY", "MA", "CA", "CA", "SC", "CO", "MI", "MI", "M…
summary(breweryData)
## Brew_ID Name City State
## Min. : 1.0 Length:558 Length:558 Length:558
## 1st Qu.:140.2 Class :character Class :character Class :character
## Median :279.5 Mode :character Mode :character Mode :character
## Mean :279.5
## 3rd Qu.:418.8
## Max. :558.0
# Brew_ID Name City State
#Report the number of NA's in each column.
breweryData %>%
summarise_all(funs(sum(is.na(.))))#count the number of NAs or blanks
## # A tibble: 1 x 4
## Brew_ID Name City State
## <int> <int> <int> <int>
## 1 0 0 0 0
#no missing values
library("httr")
library("jsonlite")
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
#2018 projected population data
pop_url <- "https://api.census.gov/data/2018/pep/population?get=POP,GEONAME&for=state"
statepop_raw<- pop_url %>%
GET() %>%
content("text") %>%
fromJSON(flatten=TRUE) %>%
as_tibble() %>%
slice(2:n())#remove the first row which is the labels
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
names(statepop_raw)<-c("Population","State","State_ID")
statepop_raw$Population<- as.integer(statepop_raw$Population)
#add state abbreviations and DC/puerto rico
statepop_raw$State_Abb <- c(sort(c("DC",state.abb)),"PR")
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.2
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
area_url <-"https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area"
statearea<- area_url %>%
read_html() %>%
html_node('#mw-content-text > div > table:nth-child(6)') %>%
html_table(header = F) %>%
slice(3:n()) %>% #remove the two rows which are labels
select(X1,X6) %>% #keep the state area excluding water
rename("State_name"="X1","Square_miles"="X6") %>%
mutate(Square_miles=as.numeric(gsub(",", "", Square_miles))) #remove thousands separator ","
Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.
beer_brewery<-beerData %>%
left_join(breweryData, by=c("Brewery_id"="Brew_ID"), suffix = c(".beer", ".brewery")) %>%
rename(Beer_name=Name.beer, Brewery_name=Name.brewery) %>%
#separate continuous variables into 3 groups, low/medium/high
mutate(ABV_rating= cut_number(ABV, n = 3,labels=c("low","medium","high"))) %>%
mutate(IBU_rating= cut_number(IBU, n = 3,labels=c("low","medium","high")))
beer_brewery %>%
filter(is.na(Brewery_name))
## # A tibble: 0 x 12
## # … with 12 variables: Beer_name <chr>, Beer_ID <int>, ABV <dbl>,
## # IBU <int>, Brewery_id <int>, Style <chr>, Ounces <dbl>,
## # Brewery_name <chr>, City <chr>, State <chr>, ABV_rating <fct>,
## # IBU_rating <fct>
#returns no results so all beers have a brewery
#Post merge data peak
first_combo_brewery<-head(beer_brewery,6)
last_combo_brewery<-tail(beer_brewery,6)
## Warning: package 'tidystringdist' was built under R version 3.5.2
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 2,653,056 x 12
## V1 V2 osa lv dl hamming lcs qgram cosine jaccard jw
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brew… Brew… 1 1 1 1 2 2 0.0147 0.105 0.0238
## 2 Boon… Boon… 1 1 1 1 2 2 0.0278 0 0.0303
## 3 Broo… Broo… 1 1 1 1 2 2 0.0270 0.105 0.0290
## 4 Cald… Cald… 1 1 1 1 2 2 0.0417 0.125 0.0370
## 5 Cald… Cald… 1 1 1 1 2 2 0.0204 0 0.0290
## 6 Cald… Cald… 1 1 1 1 2 2 0.0204 0.133 0.0290
## 7 Cald… Cald… 1 1 1 1 2 2 0.0204 0.133 0.0290
## 8 Cald… Cald… 1 1 1 1 2 2 0.0204 0.133 0.0290
## 9 Indi… Indi… 1 1 1 1 2 2 0.0426 0 0.137
## 10 Vert… Vort… 1 1 1 1 2 2 0.0871 0.100 0.122
## # … with 2,653,046 more rows, and 1 more variable: soundex <dbl>
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 4,950 x 12
## V1 V2 osa lv dl hamming lcs qgram cosine jaccard jw
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Altb… Witb… 2 2 2 2 4 4 0.244 0.375 0.190
## 2 Cider Other 3 3 3 3 6 6 0.6 0.75 0.4
## 3 Cider Radl… 3 3 3 Inf 5 5 0.452 0.625 0.3
## 4 Belg… Belg… 3 3 3 3 6 6 0.0979 0.308 0.176
## 5 Amer… Amer… 3 3 3 3 6 6 0.0607 0.188 0.170
## 6 Belg… Belg… 3 3 3 3 6 6 0.0660 0.188 0.120
## 7 Gose Bock 3 3 3 3 6 6 0.75 0.857 0.5
## 8 Crea… Whea… 3 3 3 3 6 6 0.273 0.545 0.222
## 9 Euro… Euro… 3 3 3 3 6 6 0.124 0.308 0.203
## 10 Engl… Engl… 3 3 3 3 6 6 0.0720 0.235 0.179
## # … with 4,940 more rows, and 1 more variable: soundex <dbl>
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 151,525 x 12
## V1 V2 osa lv dl hamming lcs qgram cosine jaccard jw
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agai… Agai… 1 1 1 1 2 2 0.0198 0.0625 0.0267
## 2 Rive… Rive… 1 1 1 Inf 1 1 0.00949 0 0.0395
## 3 Ashe… Aspe… 2 2 2 2 4 4 0.0626 0.0588 0.178
## 4 Asla… Aspe… 2 2 2 2 4 4 0.0620 0.0588 0.107
## 5 Aver… Lave… 2 2 2 Inf 3 3 0.0451 0.118 0.0462
## 6 Fate… Matt… 2 2 2 2 4 4 0.0714 0.118 0.104
## 7 Kalo… Kona… 2 2 2 Inf 2 2 0.0250 0.0625 0.0553
## 8 Kona… NoDa… 2 2 2 2 4 4 0.0646 0.176 0.215
## 9 Matt… Maui… 2 2 2 2 4 4 0.107 0.118 0.122
## 10 Maui… Miam… 2 2 2 Inf 3 3 0.0417 0.0625 0.119
## # … with 151,515 more rows, and 1 more variable: soundex <dbl>
Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
#Summary data by State
beer_brewery_byState<-beer_brewery %>%
group_by(State) %>%
summarise(
Number_of_beers= n(),
Number_of_breweries=length(unique(Brewery_name)),
Number_of_styles=length(unique(Style)),
IBU_count = sum(!is.na(IBU)),
IBU_average = mean(IBU, na.rm = TRUE),
IBU_median = median(IBU, na.rm = TRUE),
IBU_sd = sd(IBU, na.rm = TRUE),
ABV_count = sum(!is.na(ABV)),
ABV_average = mean(ABV, na.rm = TRUE),
ABV_median = median(ABV, na.rm = TRUE),
ABV_sd = sd(ABV, na.rm = TRUE),
Ounces_count = sum(!is.na(Ounces)),
Ounces_average = mean(Ounces, na.rm = TRUE),
Ounces_median = median(Ounces, na.rm = TRUE),
Ounces_sd = sd(Ounces, na.rm = TRUE),
)
#add counts for Ounces
beer_brewery_byState<-beer_brewery %>%
filter(!is.na(Ounces)) %>%
count(State,Ounces) %>%
spread(Ounces,n, fill=0,sep = "_") %>%
right_join(beer_brewery_byState, by="State")
#add category counts for ABV_rating
beer_brewery_byState<-beer_brewery %>%
filter(!is.na(ABV_rating)) %>%
count(State,ABV_rating) %>%
spread(ABV_rating,n, fill=0,sep = "_") %>%
right_join(beer_brewery_byState, by="State")
#add category counts for IBU_rating
beer_brewery_byState<-beer_brewery %>%
filter(!is.na(IBU_rating)) %>%
count(State,IBU_rating) %>%
spread(IBU_rating,n, fill=0,sep = "_") %>%
right_join(beer_brewery_byState, by="State")
#add in population values
beer_brewery_byState<-beer_brewery_byState %>%
left_join(y=statepop_raw, by=c("State"="State_Abb")) %>%
select(c(-State_ID)) %>%
rename(State_name=State.y) %>%
mutate(breweries_per_million_person=(1000000*Number_of_breweries/Population))
#add in state area in Square_miles
beer_brewery_byState<-beer_brewery_byState %>%
left_join(y=statearea, by="State_name") %>%
mutate(breweries_per_thousand_sqmiles=(1000*Number_of_breweries/Square_miles))
#add ranks
beer_brewery_byState<-beer_brewery_byState %>%
mutate(ABV_rank=rank(desc(ABV_median)),
ABV_sd_rank=rank(desc(ABV_sd)),
IBU_rank=rank(desc(IBU_median)),
IBU_sd_rank=rank(desc(IBU_sd)),
Ounce_rank=rank(desc(Ounces_median)),
Ounce_sd=rank(desc(Ounces_sd))
)
How many breweries in each state
brewstate<-beer_brewery_byState %>%
arrange(desc(Number_of_breweries)) %>%
select(State_name, Number_of_breweries, Number_of_beers, Number_of_styles) %>%
rename_all(funs(str_replace_all(., "_", " ")))
top_brewstate <- head(brewstate,10)
bottom_brewstate <- tail(brewstate,10)
top_brewstate
## # A tibble: 10 x 4
## `State name` `Number of breweries` `Number of beers` `Number of styles`
## <chr> <int> <int> <int>
## 1 Colorado 46 265 61
## 2 California 39 183 46
## 3 Michigan 32 162 46
## 4 Oregon 29 125 29
## 5 Texas 28 130 46
## 6 Pennsylvania 24 100 42
## 7 Maine 23 82 33
## 8 Washington 23 68 23
## 9 Iowa 22 139 47
## 10 Nebraska 19 59 25
bottom_brewstate
## # A tibble: 10 x 4
## `State name` `Number of breweri… `Number of beer… `Number of style…
## <chr> <int> <int> <int>
## 1 New Mexico 3 8 6
## 2 Tennessee 3 6 5
## 3 Arizona 2 5 5
## 4 District of Colu… 2 2 2
## 5 Missouri 2 11 8
## 6 North Carolina 2 11 10
## 7 Delaware 1 8 8
## 8 Nevada 1 3 3
## 9 South Dakota 1 7 7
## 10 Wisconsin 1 2 2
states <- map_data("state")
map_data<-beer_brewery_byState %>%
mutate(State_name=tolower(State_name)) %>% #to match the state mapping data
right_join(states, by=c("State_name"="region"))
state_base <- ggplot(data = map_data, mapping = aes(x = long, y = lat, group = group)) +
geom_polygon(color = "black", fill = "gray") +
theme_void()
state_base +
geom_polygon(data = map_data, aes(fill = Population)) #TODO(BJH) think about breaking into subregions

library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#Example on how to arrange multiple plots
#MISC
state_base + geom_polygon(data = map_data, aes(fill = Number_of_styles)) +
scale_fill_viridis(option="D")

state_base + geom_polygon(data = map_data, aes(fill = Ounces_median)) +
scale_fill_viridis(option="D")

#General Statistics
grid.arrange(
state_base + geom_polygon(data = map_data, aes(fill = Number_of_breweries)) +
scale_fill_viridis(option="D")+ labs(fill = "# of Breweries"),
state_base + geom_polygon(data = map_data, aes(fill = Number_of_beers)) +
scale_fill_viridis(option="D")+ labs(fill = "# of Beers"),
state_base + geom_polygon(data = map_data, aes(fill = breweries_per_million_person)) +
scale_fill_viridis(option="D")+ labs(fill = "Breweries/Mil People"),
state_base + geom_polygon(data = map_data, aes(fill = Population/1000000)) + labs(fill = "Population (millions)") + scale_fill_viridis(option="D"),
ncol=2, nrow=2
) %>%
ggsave(filename='Export/general_statistics_map.jpg')
## Saving 7 x 5 in image

#ABV data mapped
grid.arrange(
state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_high)) + scale_fill_viridis(option="D") +
labs(fill = "High ABV Beers"),
# state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_medium)) + scale_fill_viridis(option="D"),
state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_low))+ scale_fill_viridis(option="D") +
labs(fill = "Low ABV Beers"),
state_base + geom_polygon(data = map_data, aes(fill = ABV_sd)) + scale_fill_viridis(option="D") +
labs(fill = "ABV Standard Deviation"),
state_base + geom_polygon(data = map_data, aes(fill = ABV_median)) + scale_fill_viridis(option="D")+
labs(fill = "ABV Median"),
ncol=2, nrow=2
) %>%
ggsave(filename='Export/ABV_beers_map.jpg')
## Saving 7 x 5 in image

#IBU data mapped
grid.arrange(
state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_high)) + scale_fill_viridis(option="D") +
labs(fill = "High IBU Beers"),
# state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_medium)) + scale_fill_viridis(option="D"),
state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_low))+ scale_fill_viridis(option="D") +
labs(fill = "Low IBU Beers"),
state_base + geom_polygon(data = map_data, aes(fill = IBU_sd)) + scale_fill_viridis(option="D") +
labs(fill = "IBU Std. Dev."),
state_base + geom_polygon(data = map_data, aes(fill = IBU_median)) + scale_fill_viridis(option="D")+
labs(fill = "IBU Median"),
ncol=2, nrow=2
) %>%
ggsave(filename='Export/IBU_beers_map.jpg')
## Saving 7 x 5 in image

#MAPPING ABV BY STATE
grid.arrange(
#HIGH ABV Beers per million people
state_base +
geom_polygon(data = map_data, aes(fill = ABV_rating_high)) +
scale_fill_continuous(name = "High ABV Beers"),
#MEDIUM ABV Beers per million people
# state_base +
# geom_polygon(data = map_data, aes(fill = ABV_rating_medium)) +
# scale_fill_continuous(name = "Medium ABV Beers"),
#LOW ABV Beers per million people
state_base +
geom_polygon(data = map_data, aes(fill = ABV_rating_low)) +
scale_fill_continuous(name = "Low ABV Beers"),
#HIGH ABV Beers per million people
state_base +
geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_high)/Population))+
scale_fill_continuous(name = "High ABV per mil."),
#MEDIUM ABV Beers per million people
# state_base +
# geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_medium)/Population))+
# scale_fill_continuous(name = "Medium ABV beers per million people"),
#LOW ABV Beers per million people
state_base +
geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_low)/Population))+
scale_fill_continuous(name = "Low ABV beers per mil.")
,ncol=2, nrow=2
) %>%
ggsave(filename = "Export/ABV_by_state.jpg")
## Saving 7 x 5 in image

state_base +
geom_polygon(data = map_data, aes(fill = ABV_sd))

ggsave('Export/ABV_SD_data.jpg')
## Saving 7 x 5 in image
state_base +
geom_polygon(data = map_data, aes(fill = IBU_rating_high))

ggsave('Export/ibu_high_data.jpg')
## Saving 7 x 5 in image
state_base +
geom_polygon(data = map_data, aes(fill = IBU_rating_medium))

ggsave('Export/ibu_med_data.jpg')
## Saving 7 x 5 in image
state_base +
geom_polygon(data = map_data, aes(fill = IBU_rating_low))

ggsave('Export/ibu_low_data.jpg')
## Saving 7 x 5 in image
state_base +
geom_polygon(data = map_data, aes(fill = IBU_sd))

ggsave('Export/ibu_SD_data.jpg')
## Saving 7 x 5 in image
#top 10 states for having high IBU beers
top_ibu<-beer_brewery_byState %>%
arrange(desc(IBU_rating_high)) %>%
head(10)
#bottom 10 states for having high IBU beers
bottom_ibu<-beer_brewery_byState %>%
arrange(desc(IBU_rating_high)) %>%
tail(10)
#barplot of median IBU by state, maybe do the top 10 and bottom 10 becuase it's crowded
top_ibu<-beer_brewery_byState %>%
arrange(IBU_median) %>%
mutate(State=factor(State, levels=State)) %>%
head(10) %>%
ggplot(aes(x=State, y=IBU_median)) + geom_col()+
labs(title="States with Highest IBU",
subtitle="", y="Median IBU")
ggsave('Export/Hi_median_alch.jpg')
## Saving 7 x 5 in image
beer_brewery_byState %>%
arrange(desc(IBU_median)) %>%
mutate(State=factor(State, levels=State)) %>%
head(10) %>%
ggplot(aes(x=State, y=IBU_median)) + geom_col() +
labs(title="States with Lowest IBU",
y="IBU")

ggsave('Export/Low_median_alch.jpg')
## Saving 7 x 5 in image
#barplot of median ABV by state, maybe do the top 10 and bottom 10 because it's crowded
beer_brewery_byState %>%
arrange(ABV_median) %>% #lowest to highest
mutate(State=factor(State, levels=State)) %>%# To set the order appropriately for the plot
head(10) %>%
ggplot(aes(x=State, y=ABV_median)) + geom_col() +
coord_flip()+
labs(title="States with Lowest ABV",
y="ABV")

ggsave('Export/bottom_abv_summary.jpg')
## Saving 7 x 5 in image
beer_brewery_byState %>%
arrange(desc(ABV_median)) %>% #highest to lowers
mutate(State=factor(State, levels=State)) %>% # To set the order appropriately for the plot
head(10) %>%
ggplot(aes(x=State, y=ABV_median)) + geom_col() +
coord_flip() +
labs(title="States with Highest ABV",
y="ABV")

ggsave('Export/top_abv_summary.jpg')
## Saving 7 x 5 in image
#BASIC POSITIVE CORRELATION
beer_brewery %>%
ggplot(aes(x=IBU, y=ABV)) + geom_point()+
labs(title="ABV and IBU Scatterplot")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave('Export/ibu_abv_scat_before.jpg')
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
#WITH MARKER INDICATING MARKET OPPORTUNITY
beer_brewery %>%
ggplot(aes(x=IBU, y=ABV)) +
geom_point() +
annotate("rect", xmin=c(50), xmax=c(120), ymin=c(.035) , ymax=c(.06), alpha=0.2, color="blue", fill="blue") +
annotate("text", label="Opportunity", x=c(70), y=c(.04), color="Blue")+
labs(title="ABV and IBU Scatterplot",
subtitle="Market Opportunity")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave('Export/ibu_abv_scat.jpg')
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
beer_brewery %>%
ggplot(aes(x=Ounces, y=IBU)) + geom_point(position = "jitter")+
labs(title="IBU and OUNCES Scatterplot")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave(filename="Export/ounce_IBU.jpg")
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
beer_brewery %>%
ggplot(aes(x=as.factor(Ounces), y=IBU)) + geom_point(position = "jitter")+
labs(title="IBU and OUNCES Scatterplot", x="Ounces")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave(filename="Export/ounce_ABV.jpg")
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
beer_brewery %>%
ggplot(aes(x=as.factor(Ounces), y=ABV)) + geom_point(position = "jitter")+
labs(title="ABV and OUNCES Scatterplot", x="Ounces")
## Warning: Removed 62 rows containing missing values (geom_point).

Which state has the maximum alcoholic (ABV) beer?
#top 10 states for having high ABV beers
top_abv<-beer_brewery_byState %>%
arrange(desc(ABV_rating_high)) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
Which state has highest variance in ABV?
#top 10 states with the most variance in ABV
top_abv_sd<-beer_brewery_byState %>%
arrange(desc(ABV_sd)) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
#top 10 states with the least variance in ABV
bottom_abv_sd<-beer_brewery_byState %>%
arrange(ABV_sd) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
Which states have the most and least variance in beers?
#top 10 states with the most variance in ABV
top_abv_sd<-beer_brewery_byState %>%
arrange(desc(ABV_sd)) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
#top 10 states with the least variance in ABV
bottom_abv_sd<-beer_brewery_byState %>%
arrange(ABV_sd) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
#top 10 states for having high IBU beers
top_ibu<-beer_brewery_byState %>%
arrange(desc(IBU_median)) %>%
select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
top_ibu
## # A tibble: 10 x 7
## State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ME 17.4 44 61 1 0.051
## 2 WV 19.1 42 57.5 2 0.062
## 3 FL 22.5 34 55 3.5 0.057
## 4 GA 16.2 47 55 3.5 0.055
## 5 DE NA 50 52 5 0.055
## 6 NM 36.7 3 51 6 0.062
## 7 NH 47.4 2 48.5 7 0.055
## 8 DC 50.9 1 47.5 8 0.0625
## 9 NY 23.4 30 47 9 0.055
## 10 AK 23.3 31 46 10 0.056
## # … with 1 more variable: `ABV rank` <dbl>
#top 10 states with the most variance in IBU
top_ibu_sd<-beer_brewery_byState %>%
arrange(desc(IBU_sd)) %>%
select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
top_ibu_sd
## # A tibble: 10 x 7
## State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DC 50.9 1 47.5 8 0.0625
## 2 NH 47.4 2 48.5 7 0.055
## 3 NM 36.7 3 51 6 0.062
## 4 CT 36.1 4 29 41.5 0.06
## 5 ID 35.5 5 39 21.5 0.0565
## 6 VT 34.4 6 30 38.5 0.055
## 7 NJ 31.9 7 34.5 29 0.046
## 8 OK 31.1 8 35 26.5 0.06
## 9 KS 30.0 9 20 49 0.05
## 10 PA 29.2 10 30 38.5 0.057
## # … with 1 more variable: `ABV rank` <dbl>
#top 10 states with the least variance in IBU
bottom_ibu_sd<-beer_brewery_byState %>%
arrange(IBU_sd) %>%
select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>%
rename_all(funs(str_replace_all(., "_", " "))) %>%
head(10)
bottom_ibu_sd
## # A tibble: 10 x 7
## State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TN 15.9 48 37 24 0.0570
## 2 GA 16.2 47 55 3.5 0.055
## 3 LA 16.4 46 31.5 35.5 0.052
## 4 NE 17.2 45 35 26.5 0.056
## 5 ME 17.4 44 61 1 0.051
## 6 RI 17.9 43 24 44.5 0.055
## 7 WV 19.1 42 57.5 2 0.062
## 8 WA 19.9 41 38 23 0.0555
## 9 WY 20.4 40 21 47 0.05
## 10 MT 20.5 39 40 18.5 0.055
## # … with 1 more variable: `ABV rank` <dbl>
Summary statistics for the IBU variable.
summary(beerData$IBU)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.00 21.00 35.00 42.71 64.00 138.00 1005
ggplot(beerData, aes(x=IBU)) +
geom_histogram(position="identity", alpha=0.5)+
geom_vline(data=beerData,aes(xintercept=median(beerData$IBU)),linetype="dashed")+
labs(title="International Bitterness Unit(IBU) Histogram",x="IBU", y = "Count") +
geom_vline(aes(xintercept = median(IBU, na.rm=T)),linetype="dotted", color="black") + annotate("text", x = c(33), y = c(120), label = c(paste("Median", median(beerData$IBU, na.rm=T))) , color="black", size=5 , angle=90, fontface="bold")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1005 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).

ggsave('Export/IBU_by_volume.jpg')
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1005 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).
Summary statistics for the ABV variable.
summary(beerData$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800 62
ggplot(beerData, aes(x=ABV)) +
geom_histogram(position="identity", alpha=0.5)+
geom_vline(data=beerData,aes(xintercept=median(beerData$ABV)),linetype="dashed")+
labs(title="Alcohol By Volume(ABV) Histogram",x="ABV", y = "Count") +
geom_vline(aes(xintercept = median(ABV, na.rm=T)),linetype="dotted", color="black") + annotate("text", x = c(.053), y = c(150), label = c(paste("Median", median(beerData$ABV, na.rm=T))) , color="black", size=5 , angle=90, fontface="bold")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).

ggsave('Export/abv_by_volume.jpg')
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).
grid.arrange(
beer_brewery_byState %>%
ggplot(aes(x=Number_of_beers, y=ABV_median)) + geom_point() +
labs(title="Number of Beers and ABV"),
beer_brewery_byState %>%
ggplot(aes(x=Number_of_styles, y=ABV_median)) + geom_point() +
labs(title="Number of Styles and ABV"),
beer_brewery_byState %>%
ggplot(aes(x=Number_of_breweries, y=ABV_median)) + geom_point() +
labs(title="Number of Breweries and ABV"),
ncol=2, nrow=2) %>%
ggsave(filename="Export/beer_culture_ABV.jpg")
## Saving 7 x 5 in image

grid.arrange(
beer_brewery_byState %>%
ggplot(aes(x=Number_of_beers, y=IBU_median)) + geom_point() +
labs(title="Number of Beers and IBU"),
beer_brewery_byState %>%
ggplot(aes(x=Number_of_styles, y=IBU_median)) + geom_point() +
labs(title="Number of Styles and IBU"),
beer_brewery_byState %>%
ggplot(aes(x=Number_of_breweries, y=IBU_median)) + geom_point() +
labs(title="Number of Breweries and IBU"),
ncol=2, nrow=2) %>%
ggsave(filename="Export/beer_culture_IBU.jpg")
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Saving 7 x 5 in image

This code produces a custom Powerpoint generating slides with data calculated and plotted above
#Read baseline PowerPoint file
my_pres <- read_pptx('Visuals/presentation.pptx')
#view(layout_summary(my_pres))
#view(layout_properties(my_pres))
# States with the most brewries
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "States With the Most Breweries", location = ph_location(left = 3, top = .5, width = 8, height = 1)) %>%
ph_with_table_at(value = top_brewstate,
height = 5, width = 8, left = 3, top = 2,
last_row = FALSE, last_column = FALSE, first_row = TRUE)
# States with the least breweries
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "States With the Least Breweries", location = ph_location(left = 3, top = .5, width = 8, height = 1)) %>%
ph_with_table_at(value = bottom_brewstate,
height = 5, width = 8, left = 3, top = 2,
last_row = FALSE, last_column = FALSE, first_row = TRUE)
# How many NA's were found
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "What Data Was Missing?", location = ph_location(left = 3.5, top = .5, width = 8, height = 1)) %>%
ph_with(value = missing_info, location = ph_location(left = 2.5, top = 3, width = 12, height = 1))
#General Beer Statistics Map
filename = "Export/general_statistics_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "General Beer Statistics Map", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#
filename = "Export/ABV_beers_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "ABV by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#
filename = "Export/IBU_beers_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "IBU by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#
filename = "Export/Ounces_data.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Ounces by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Compute alcohol by volume
filename = "Export/abv_by_volume.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Compute bitterness by volume
filename = "Export/ibu_by_volume.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Relationship between Ounces and IBU
filename = "Export/ounce_IBU.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Relationship between Ounces and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Compute the relationship between IBU and ABV
filename = "Export/ibu_abv_scat_before.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Relationship Between ABV and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Compute the relationship between IBU and ABV
filename = "Export/ibu_abv_scat.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Relationship Between ABV and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Relationship between State's beer activity and ABV
filename = "Export/beer_culture_ABV.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Relationship between State's beer activity and ABV", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Relationship between State's beer activity and IBU
filename = "Export/beer_culture_IBU.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Relationship between State's beer activity and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 10, height = 6.5),
location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5))
#Summary slide
filename = "Visuals/summary.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(value = "Summary and Opportunities", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
ph_with(external_img(src = filename, width = 13, height = 7.5),
location = ph_location(left = .15, top = .15, width = 13, height = 7.5))
#Repository slide
filename = "Visuals/repository.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
ph_with(external_img(src = filename, width = 13, height = 7.5),
location = ph_location(left = .15, top = .15, width = 13, height = 7.5))
# Print to save powerpoint
extension <- format(Sys.time(),'%b%d%H%M%S')
print(my_pres, target = paste("anheuser-budwieser", extension, "v1.pptx", sep='') )
## [1] "/Users/bjholmes/Library/Mobile Documents/com~apple~CloudDocs/DataScience/Doing Data Science/Unit_7/case_study_1/anheuser-budwieserJun27232438v1.pptx"